Available files (in the data-generated folder and subfolders):
- miappe_metadata.ttl: RDF metadata for 5 CE experiments
- MIAPPE_CxE_v1.1.xlsx: Excel file with data and metadata - has been converted to miappe-metadata.ttl
- station_metadata.ttl: information about the "weather stations" with the weather (mean day temperature, hours of sunlight) for the 5 CE experiments, but no actual data
- weather folder: all_weather.ttl combines the content of all files in the weather folder.
- phenotypic folder: data_199NL.ttl, data_2003VE.ttl, data_2004Fin.ttl, data_2005Fin.ttl, data_2010ET.ttl
# setup
import os
import random
import numpy as np
import numpy.polynomial.polynomial as poly
import rdflib
import requests
import SPARQLWrapper
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#from matplotlib.ticker import AutoMinorLocator
import plotly.express as px
import plotly.graph_objects as go
#from numpy.polynomial import Polynomial
# Display configuration and base folders for the generated data files.
# None means "never truncate column contents"; the legacy -1 sentinel was
# deprecated in pandas 1.0 and removed later.
pd.set_option('display.max_colwidth', None)  # or 199 for a fixed width
data_loc = 'data-generated' + os.sep
data_weather = data_loc + 'weather' + os.sep
data_pheno = data_loc + 'phenotypic' + os.sep
# helper functions: query_DB, formula_btt, calculate_PBTT
def query_DB(sparql_query):
    '''
    Query the endpoint with the given query string and return the results
    as a pandas DataFrame.

    The JSON bindings are flattened, so each SELECT variable yields
    columns like '<var>.value' and '<var>.type'.
    '''
    # create the connection to the endpoint (Fuseki service in the stack)
    sparql_service_url = 'http://fuseki:3030/pheno/query'
    #sparql_service_url = 'http://localhost:3030/pheno/query'
    sparql = SPARQLWrapper.SPARQLWrapper(sparql_service_url, agent='Sparql Wrapper on Jupyter example')
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(SPARQLWrapper.JSON)
    # ask for the result
    result = sparql.query().convert()
    # pd.json_normalize replaces pandas.io.json.json_normalize, which was
    # deprecated in pandas 1.0 and removed in 2.0.
    return pd.json_normalize(result['results']['bindings'])
def formula_btt(t):
    '''
    Beta thermal time (BTT) for a daily mean temperature `t` (degrees C),
    with fixed cardinal temperatures for potato. Returns 0 when `t` is at
    or outside the viable range [tb, tc].
    '''
    # fixed values based on potato: base, optimum, ceiling temperature
    # and curvature exponent
    tb = 5.5
    to = 23.4
    tc = 34.6
    ct = 1.7
    # Outside (tb, tc) the beta function is zero or undefined (a negative
    # base raised to a fractional power yields a complex number). The
    # original code computed it anyway and relied on the resulting
    # TypeError from `complex > 0`, caught by a bare except, to return 0.
    if t <= tb or t >= tc:
        return 0
    btt = (
        ((tc - t) / (tc - to)) *
        ((t - tb) / (to - tb)) **
        ((to - tb) / (tc - to))
    ) ** ct
    return btt if btt > 0 else 0
def calculate_PBTT(df_w_cur):
    '''
    Compute photoperiod-weighted beta thermal time for a weather table.

    Expects columns 'mean_temperature' and 'hours_of_sunlight'. Adds the
    columns BTT, day_fraction, PBTT and cum_PBTT and returns the augmented
    DataFrame together with the final cumulative PBTT value.
    '''
    # copy after dropna so we never mutate (a view of) the caller's frame
    df_w_cur = df_w_cur.dropna().copy()
    df_w_cur['mean_temperature'] = pd.to_numeric(df_w_cur['mean_temperature'])
    df_w_cur['hours_of_sunlight'] = pd.to_numeric(df_w_cur['hours_of_sunlight'])
    # daily beta thermal time from the mean temperature
    df_w_cur['BTT'] = df_w_cur['mean_temperature'].apply(formula_btt)
    # fraction of the day with sunlight (vectorized; no row-wise apply)
    df_w_cur['day_fraction'] = df_w_cur['hours_of_sunlight'] / 24.0
    # photoperiod-weighted BTT; clamp at 0 like the original
    df_w_cur['PBTT'] = (df_w_cur['BTT'] * df_w_cur['day_fraction']).where(
        df_w_cur['BTT'] > 0, 0)
    df_w_cur['cum_PBTT'] = df_w_cur['PBTT'].cumsum()
    # final cumulative value (raises IndexError on an empty frame, as before)
    cum_pbtt = df_w_cur.loc[df_w_cur.index[-1], 'cum_PBTT']
    return df_w_cur, cum_pbtt
def autolabel(rects, axes=None):
    '''
    Attach a text label above each bar displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar rectangles (objects with get_height/get_x/get_width).
    axes : matplotlib Axes, optional
        Axes to draw on. Defaults to the notebook-global `ax`, which keeps
        the original global-based call sites working unchanged.

    taken from: from https://stackoverflow.com/questions/30228069/how-to-display-the-value-of-the-bar-on-each-bar-with-pyplot-barh
    '''
    target = axes if axes is not None else ax
    for rect in rects:
        height = rect.get_height()
        target.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                    round(height, 2),
                    ha='center', va='bottom')
def non_increasing(L):
    """True when every element is >= its successor (ties allowed)."""
    return all(later <= earlier for earlier, later in zip(L, L[1:]))

def non_decreasing(L):
    """True when every element is <= its successor (ties allowed)."""
    return all(earlier <= later for earlier, later in zip(L, L[1:]))

def monotonic(L):
    """True when the sequence is entirely non-increasing or non-decreasing."""
    return non_decreasing(L) or non_increasing(L)
We can start with the top MIAPPE object, the Investigation, and examine the properties it has:
# fetch investigation properties
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?investigationProperty
WHERE {
?inv a ppeo:investigation .
?inv ?investigationProperty ?_
}
''')
print('Investigation properties:')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['investigationProperty.value']
df_inv = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
df_inv
Next, we can check the values of those properties. For those that are not plain literals, we also choose to show their class here (if given):
# fetch investigation properties and values (no reasoning for transitivity)
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?investigationProperty ?target ?type
WHERE {
?inv a ppeo:investigation ;
?investigationProperty ?target .
OPTIONAL {?target a ?type}
}
ORDER BY ?investigationProperty
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['investigationProperty.value',
                 'target.value', 'type.value']
df_inv = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
df_inv
We can choose to take a look at studies, the next object in the MIAPPE hierarchy, and look at some of their properties.
# fetch study properties
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?studyId ?cntName ?addr ?locName ?startDate ?endDate ?lat ?long ?alt
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier ?studyId ;
ppeo:hasStartDateTime ?startDate ;
ppeo:hasEndDateTime ?endDate ;
ppeo:hasLocation ?studyloc .
?studyloc ppeo:hasCountry ?cnt ;
ppeo:hasLatitude ?lat ;
ppeo:hasLongitude ?long ;
ppeo:hasAltitude ?alt ;
ppeo:hasAddress ?addr ;
ppeo:hasName ?locName .
?cnt ppeo:hasName ?cntName .
}
ORDER BY ?studyId
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['studyId.value', 'cntName.value', 'locName.value', 'addr.value', 'startDate.value',
                 'endDate.value', 'lat.value', 'long.value', 'alt.value']
df_studies = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
df_studies
We can easily find the genotypes (biological materials) that were part of all 5 studies:
# common genotypes across 5 studies (intersection across experiments)
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?bio_mat_id
WHERE {
?study1 a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat ;
ppeo:hasIdentifier '1999NL' .
?study2 a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat ;
ppeo:hasIdentifier '2005Fin' .
?study3 a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat ;
ppeo:hasIdentifier '2003VE' .
?study4 a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat ;
ppeo:hasIdentifier '2004Fin' .
?study5 a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat ;
ppeo:hasIdentifier '2010ET' .
?bio_mat ppeo:hasIdentifier ?bio_mat_id .
}
ORDER BY ?bio_mat_id
''')
result = query_DB(datatable_query)
# keep the '.value' column only, stripping that suffix from the header
columnsToKeep = ['bio_mat_id.value']
df_geno = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
print(str(len(df_geno)) + ' genotypes overlap:')
df_geno.head()
We can also calculate the total number of genotypes, i.e. the ones that were studied in at least one experiment:
# all genotypes (union across experiments)
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?bio_mat_id
WHERE {
?study1 a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat .
?bio_mat ppeo:hasIdentifier ?bio_mat_id .
}
ORDER BY ?bio_mat_id
''')
result = query_DB(datatable_query)
# keep the '.value' column only, stripping that suffix from the header
columnsToKeep = ['bio_mat_id.value']
df_geno = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
print(str(len(df_geno)) + ' genotypes in total:')
df_geno.head()
# genotypes per experiment
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT (?study_id as ?Study_ID) (count(distinct ?bio_mat_id) as ?genotypes)
WHERE {
?study a ppeo:study ;
ppeo:hasBiologicalMaterial ?bio_mat ;
ppeo:hasIdentifier ?study_id .
?bio_mat ppeo:hasIdentifier ?bio_mat_id .
} GROUP BY ?study_id
''')
result = query_DB(datatable_query)
print('Genotypes per experiment:')
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['Study_ID.value', 'genotypes.value']
df_geno = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
df_geno.head()
# pull stations from /weather with service
# NOTE: the original query ended with ORDER BY ASC(?obsTime), a copy-paste
# leftover from the observation queries below; ?obsTime is never bound here,
# so it ordered nothing. Order by station name for deterministic output.
datatable_query = (
'''
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?name ?latitude ?longitude ?altitude
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?name ;
geo:location ?loc .
?loc geo:lat ?latitude ;
geo:long ?longitude ;
geo:alt ?altitude .
}
} ORDER BY ASC(?name)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['name.value', 'latitude.value',
                 'longitude.value', 'altitude.value']
df_stations = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
df_stations
# pull NL sunhours from /weather with service
datatable_query = (
'''
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?obsTime ?obsVal
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?obs a aemet:Observation ;
aemet:valueOfObservedData ?obsVal ;
aemet:observedInInterval ?obsTime ;
ssn:observedProperty ?var ;
ssn:observedBy ?station .
?station aemet:stationName ?obsStation .
FILTER("WUR station"=?obsStation) .
FILTER(?obsTime > "1999-05-11"^^xsd:date &&
?obsTime < "1999-11-15"^^xsd:date)
?var a aemet:AmbientProperty ;
rdfs:label "Hours of sunlight" .
}
} ORDER BY ASC(?obsTime)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers;
# df_nl_sun is reused by the plotting cell below
columnsToKeep = ['obsTime.value', 'obsVal.value']
df_nl_sun = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
display(df_nl_sun.head())
Plot (hours of sunlight):
# make plot
# convert the SPARQL string results to real dtypes for plotting
df_nl_sun.loc[:, 'obsTime'] = pd.to_datetime(df_nl_sun['obsTime'])
df_nl_sun.loc[:, 'obsVal'] = pd.to_numeric(df_nl_sun['obsVal'], downcast="float")
# let matplotlib accept pandas datetime values on the x axis
pd.plotting.register_matplotlib_converters()
fig, ax = plt.subplots(figsize=(10, 5))  # NOTE: this `ax` is also the global that autolabel() falls back to
ax.plot(df_nl_sun['obsTime'], df_nl_sun['obsVal'])
ax.set_title('Daily hours of sunlight (NL)')
plt.ylabel('Hours per day')
# one tick per month, labelled YYYY-MM
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# check distances from experimental locations to weather station coordinates
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?studyId ?cntName ?stu_locName ?stu_lat ?stu_long ?w_name ?w_lat ?w_long ?dist_sq
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier ?studyId ;
ppeo:hasLocation ?studyloc .
?studyloc ppeo:hasCountry ?cnt ;
ppeo:hasLatitude ?stu_lat ;
ppeo:hasLongitude ?stu_long ;
ppeo:hasName ?stu_locName .
?cnt ppeo:hasName ?cntName .
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?w_name ;
geo:location ?w_loc .
?w_loc geo:lat ?w_lat ; geo:long ?w_long .
}
BIND(((?stu_lat - ?w_lat) * (?stu_lat - ?w_lat)) AS ?lat_diff_sq)
BIND(((?stu_long - ?w_long) * (?stu_long - ?w_long)) AS ?long_diff_sq)
BIND((?lat_diff_sq + ?long_diff_sq) AS ?dist_sq)
}
ORDER BY ?dist_sq
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['studyId.value', 'cntName.value', 'stu_locName.value', 'stu_lat.value', 'stu_long.value',
                 'w_name.value', 'w_lat.value', 'w_long.value', 'dist_sq.value']
df_distances = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
display(df_distances)
We can view the variables in a data file for a study as follows:
# variables for 1999NL
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?filename ?varName
WHERE {
?file ppeo:hasObservation ?obs ;
ppeo:hasDigitalLocation ?filename .
?study ppeo:hasDataFile ?file ;
ppeo:hasIdentifier '1999NL' .
?obs ppeo:hasVariable ?var .
?study ppeo:hasPart ?var .
?var ppeo:hasIdentifier ?varName .
}
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['filename.value', 'varName.value']
df_xvars = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
df_xvars
Create the data table for this study. We will also add a column to hold the genotypes for later use.
(The code for putting SPARQLWrapper output into a dataframe is based on this page)
# 1999NL data table
# endpoint: http://localhost:7200/repositories/miappe_20'
# One row per (date, observation unit) with tuber number per plant and,
# when present, tuber weight per plant; the per-genotype average variables
# are kept commented out in the query.
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?bmID ?TubN_total_per_plant ?TubW_total_per_plant # ?TubN_average_per_genotype ?TubW_average_per_genotype
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '1999NL' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename .
?file ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubN_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou ;
ppeo:hasValue ?TubN_total_per_plant .
OPTIONAL {?file ppeo:hasObservation ?obs2 .
?var2 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs2 ppeo:hasVariable ?var2 ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou ;
ppeo:hasValue ?TubW_total_per_plant .}
# OPTIONAL {?file ppeo:hasObservation ?obs3 .
# ?var3 a ppeo:observed_variable ;
# ppeo:hasIdentifier 'TubN_average_per_genotype' .
# ?obs3 ppeo:hasVariable ?var3 ;
# ppeo:hasDateTime ?date ;
# ppeo:hasObservedSubject ?ou ;
# ppeo:hasValue ?TubN_average_per_genotype .}
#
# OPTIONAL {?file ppeo:hasObservation ?obs4 .
# ?var4 a ppeo:observed_variable ;
# ppeo:hasIdentifier 'TubW_average_per_genotype' .
# ?obs4 ppeo:hasVariable ?var4 ;
# ppeo:hasDateTime ?date ;
# ppeo:hasObservedSubject ?ou ;
# ppeo:hasValue ?TubW_average_per_genotype .}
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:hasBiologicalMaterial ?bm .
?bm ppeo:hasIdentifier ?bmID .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
# keep the '.value' columns and strip that suffix from the headers
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 'bmID.value',
                 'TubN_total_per_plant.value', 'TubW_total_per_plant.value']
datatable_1999NL = result[columnsToKeep]
for c in columnsToKeep:
    datatable_1999NL.rename(inplace=True, columns={c: c[:-6]})
# make the weight numeric for the groupby mean below
datatable_1999NL.loc[:, 'TubW_total_per_plant'] = pd.to_numeric(datatable_1999NL['TubW_total_per_plant'], downcast="float")
# mean weight per genotype -- this Series is consumed by the merge cells
# further down (NOTE(review): its index is bmID, not a column)
datatable_1999NL_perPlant = datatable_1999NL.groupby('bmID')['TubW_total_per_plant'].mean()
display(datatable_1999NL.head())
display(datatable_1999NL.describe())
# calculate average weight per genotype
# 1999NL data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '1999NL' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study ;
ppeo:hasBiologicalMaterial ?bm .
?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_1999NL_avg = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
datatable_1999NL_avg.head()
Once again, when building the data table, we will also add a column to hold the genotypes for later use.
# 2003VE data table
# endpoint: http://localhost:7200/repositories/miappe_20'
# For Venezuela only per-genotype averages exist, so both variables are
# OPTIONAL; rows share ?date and ?ou between the two observation patterns.
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?bmID ?TubN_average_per_genotype ?TubW_average_per_genotype
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2003VE' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename .
OPTIONAL {?file ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubN_average_per_genotype' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou ;
ppeo:hasValue ?TubN_average_per_genotype .}
OPTIONAL {?file ppeo:hasObservation ?obs2 .
?var2 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_average_per_genotype' .
?obs2 ppeo:hasVariable ?var2 ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou ;
ppeo:hasValue ?TubW_average_per_genotype .}
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study ;
ppeo:hasBiologicalMaterial ?bm .
?bm ppeo:hasIdentifier ?bmID .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
# keep the '.value' columns and strip that suffix from the headers
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 'bmID.value',
                 'TubN_average_per_genotype.value', 'TubW_average_per_genotype.value']
datatable_2003VE = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2003VE.rename(inplace=True, columns={c: c[:-6]})
# make the weight numeric for the groupby mean below
datatable_2003VE.loc[:, 'TubW_average_per_genotype'] = pd.to_numeric(datatable_2003VE['TubW_average_per_genotype'], downcast="float")
# mean weight per genotype -- consumed by the merge cells further down
datatable_2003VE_perPlant = datatable_2003VE.groupby('bmID')['TubW_average_per_genotype'].mean()
display(datatable_2003VE.head())
display(datatable_2003VE.describe())
The data for Ethiopia is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant:
# 2010ET data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2010ET' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study.
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['filename.value', 'date.value', 'ouID.value',
                 'TubW_total_per_plant.value']
datatable_2010ET = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
datatable_2010ET.head()
At the same time, we only have the average tuber weight per genotype for the Netherlands and Venezuela.
We can calculate the same thing for Ethiopia and Finland. (new table datatable_2010ET_avg)
# 2010ET data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2010ET' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study ;
ppeo:hasBiologicalMaterial ?bm .
?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers;
# datatable_2010ET_avg is consumed by the merge cells further down
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_2010ET_avg = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
datatable_2010ET_avg.head()
The data for Finland is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant, for block 8 (the one that was harvested last):
# 2004Fin data table
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2004Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasSpatialDistribution ?sd .
?sd ppeo:hasSpatialDistributionType ?sdtype .
?sd ppeo:hasValue "8" .
?sdtype ppeo:hasType "block" .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['filename.value', 'date.value', 'ouID.value',
                 'TubW_total_per_plant.value']
datatable_2004Fin = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
# divide by 3 since there were 3 plants
datatable_2004Fin.loc[:, 'TubW_total_per_plant'] = pd.to_numeric(datatable_2004Fin['TubW_total_per_plant'], downcast="float") / 3
datatable_2004Fin.head()
Calculating the average tuber weight per genotype:
# 2004Fin data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2004Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study ;
ppeo:hasBiologicalMaterial ?bm ;
ppeo:hasSpatialDistribution ?sd .
?sd ppeo:hasSpatialDistributionType ?sdtype .
?sd ppeo:hasValue "8" .
?sdtype ppeo:hasType "block" .
?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_2004Fin_avg = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
# per-plant average (3 plants per plot); consumed by the merge cells below
datatable_2004Fin_avg.loc[:, 'TubW_T_avg_perPlant'] = pd.to_numeric(datatable_2004Fin_avg['TubW_T_avg'], downcast="float") / 3
datatable_2004Fin_avg.head()
# 2004Findata - plant height
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?date ?ouID ?bmID ?plantHeight
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2004Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'PlantHeight' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?plantHeight ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasBiologicalMaterial ?bm .
?bm ppeo:hasIdentifier ?bmID .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers;
# datatable_2004Fin_ph is consumed by the averaging cell that follows
columnsToKeep = ['date.value', 'ouID.value', 'bmID.value', 'plantHeight.value']
datatable_2004Fin_ph = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
datatable_2004Fin_ph.head()
# get mean for each genotype:
# drop missing-value markers ('*') and make the heights numeric; copy() so
# the assignments below do not hit a view of datatable_2004Fin_ph
datatable_2004Fin_ph2 = datatable_2004Fin_ph[datatable_2004Fin_ph.plantHeight != '*'].copy()
datatable_2004Fin_ph2['plantHeight'] = pd.to_numeric(datatable_2004Fin_ph2['plantHeight'], downcast="float")
# averages per plot: group on genotype + date + plot (ouID minus the
# per-plant suffix), via a single ';'-joined key column
datatable_2004Fin_ph3 = datatable_2004Fin_ph2.assign(plot=datatable_2004Fin_ph2['ouID'].str[:-2])
datatable_2004Fin_ph3['bmDateOu'] = (datatable_2004Fin_ph3['bmID'] + ';' + datatable_2004Fin_ph3['date'] +
                                     ';' + datatable_2004Fin_ph3['plot'])
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph3.groupby('bmDateOu')['plantHeight'].mean()
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph_perPlant.to_frame().reset_index()
datatable_2004Fin_ph_perPlant[['bmID','Date','plot']] = datatable_2004Fin_ph_perPlant['bmDateOu'].str.split(';',expand=True,)
# drop(col, 1): the positional axis argument was removed in pandas 2.0 --
# use the explicit columns= keyword instead
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph_perPlant.drop(columns='bmDateOu')
# averages per genotype (pooled over plots):
datatable_2004Fin_ph4 = datatable_2004Fin_ph2
datatable_2004Fin_ph4['bmDate'] = (datatable_2004Fin_ph2['bmID'] + ';' + datatable_2004Fin_ph2['date'])
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph4.groupby('bmDate')['plantHeight'].mean()
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph_perPlant4.to_frame().reset_index()
datatable_2004Fin_ph_perPlant4[['bmID','Date']] = datatable_2004Fin_ph_perPlant4['bmDate'].str.split(';',expand=True,)
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph_perPlant4.drop(columns='bmDate')
datatable_2004Fin_ph_perPlant4.loc[:, 'Date'] = pd.to_datetime(datatable_2004Fin_ph_perPlant4['Date'])
# planting date of the 2004 Finland trial (the original comment said
# "16 july 2010", which matched neither the value nor the study)
datatable_2004Fin_ph_perPlant4['planting'] = pd.to_datetime('2004-06-01')
# days after planting for each measurement date
datatable_2004Fin_ph_perPlant4['DAP'] = (datatable_2004Fin_ph_perPlant4['Date'] -
                                         datatable_2004Fin_ph_perPlant4['planting']).dt.days
datatable_2004Fin_ph_perPlant4.head()
The data for Finland is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant, for block IV:
# 2005Fin data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2005Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasSpatialDistribution ?sd .
?sd ppeo:hasSpatialDistributionType ?sdtype .
?sd ppeo:hasValue "IV" .
?sdtype ppeo:hasType "block" .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['filename.value', 'date.value', 'ouID.value',
                 'TubW_total_per_plant.value']
datatable_2005Fin = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
datatable_2005Fin.head()
Calculating the average tuber weight per genotype:
# 2005Fin data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2005Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasBiologicalMaterial ?bm ;
ppeo:hasSpatialDistribution ?sd .
?sd ppeo:hasSpatialDistributionType ?sdtype .
?sd ppeo:hasValue "IV" .
?sdtype ppeo:hasType "block" .
?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers;
# datatable_2005Fin_avg is consumed by the merge cells further down
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_2005Fin_avg = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
datatable_2005Fin_avg.head()
# for average weight per genotype
# Build one wide table (genotype x experiment) of mean tuber weight per
# plant. All merges here are inner joins, so only genotypes present in ALL
# five experiments survive (cf. the "overlap" query earlier).
datatable_2004Fin_avg2 = datatable_2004Fin_avg.drop('TubW_T_avg', axis=1)
finland_merge = pd.merge(datatable_2004Fin_avg2.rename(columns={'TubW_T_avg_perPlant': '2004Fin'}),
                         datatable_2005Fin_avg.rename(columns={'TubW_T_avg': '2005Fin'}), on= 'bmID')
fin_et = pd.merge(datatable_2010ET_avg.rename(columns={'TubW_T_avg': '2010ET'}),
                  finland_merge, on='bmID')
# datatable_2003VE_perPlant / datatable_1999NL_perPlant are groupby Series
# indexed by bmID; NOTE(review): presumably merge resolves on='bmID' via the
# index level name here -- verify against the pandas version in use
fin_et_ven = pd.merge(datatable_2003VE_perPlant, fin_et, on='bmID')
fin_et_ven.rename(columns={'TubW_average_per_genotype': '2003VE'}, inplace=True)
fin_et_ven_nl = pd.merge(datatable_1999NL_perPlant, fin_et_ven, on='bmID')
fin_et_ven_nl.rename(columns={'TubW_total_per_plant': '1999NL'}, inplace=True)
df_averageWeightPerGenotype = fin_et_ven_nl
# make every experiment column numeric for describe() and plotting
df_averageWeightPerGenotype[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
] = df_averageWeightPerGenotype[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
].apply(pd.to_numeric)
df_averageWeightPerGenotype = df_averageWeightPerGenotype.sort_values('bmID')
display(df_averageWeightPerGenotype.head())
display(df_averageWeightPerGenotype.describe())
# pd.set_option('display.max_rows', 300)
# plotly graph for overlapping genotypes
# NOTE: `colors` is also reused by the scatter cell further down
colors = px.colors.qualitative.Plotly
layout = go.Layout(
    legend=dict(
        x=0, y=1,
        orientation='h',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    # one x tick per genotype; slight padding on both sides of the range
    xaxis={'title':'Genotype', 'dtick':1, 'tickangle': 85,
           'rangemode': 'tozero', 'range': [-1, len(df_averageWeightPerGenotype) + 1]},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# one line+marker trace per experiment, colored from the Plotly palette
for (exp, col) in zip(['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'], colors[:5]):
    fig.add_traces(go.Scatter(x=df_averageWeightPerGenotype['bmID'],
                              y=df_averageWeightPerGenotype[exp],
                              mode='lines+markers', line=dict(color=col), name=exp))
fig.show()
# for average weight per genotype
# Same wide table as above, but with OUTER joins, so every genotype that
# appears in at least one experiment is kept (missing cells become NaN).
all_datatable_2004Fin_avg2 = datatable_2004Fin_avg.drop('TubW_T_avg', axis=1)
all_finland_merge = pd.merge(all_datatable_2004Fin_avg2.rename(columns={'TubW_T_avg_perPlant': '2004Fin'}),
                             datatable_2005Fin_avg.rename(columns={'TubW_T_avg': '2005Fin'}),
                             on= 'bmID', how='outer')
all_fin_et = pd.merge(datatable_2010ET_avg.rename(columns={'TubW_T_avg': '2010ET'}),
                      all_finland_merge, on='bmID', how='outer')
all_fin_et_ven = pd.merge(datatable_2003VE_perPlant, all_fin_et, on='bmID', how='outer')
all_fin_et_ven.rename(columns={'TubW_average_per_genotype': '2003VE'}, inplace=True)
all_fin_et_ven_nl = pd.merge(datatable_1999NL_perPlant, all_fin_et_ven, on='bmID', how='outer')
all_fin_et_ven_nl.rename(columns={'TubW_total_per_plant': '1999NL'}, inplace=True)
df_averageWeightPerGenotype_all = all_fin_et_ven_nl
# make every experiment column numeric for describe() and plotting
df_averageWeightPerGenotype_all[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
] = df_averageWeightPerGenotype_all[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
].apply(pd.to_numeric)
df_averageWeightPerGenotype_all = df_averageWeightPerGenotype_all.sort_values('bmID')
display(df_averageWeightPerGenotype_all.head())
display(df_averageWeightPerGenotype_all.describe())
pd.set_option('display.max_rows', 300)
# registered genotypes per study:
# 2005Fin: 260, 2003VE: 107, 2010ET: 190, 2004Fin: 251, 1999NL: 242
# plotly graph for all genotypes
layout = go.Layout(
    legend=dict(
        x=0, y=1,
        orientation='h',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=1200, #3200 is a good size for all labels - when dtick = 1
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    # every 3rd genotype gets a tick label to keep the axis readable
    xaxis={'title':'Genotype', 'dtick':3, 'tickangle': 85,
           'rangemode': 'tozero', 'range': [-2, len(df_averageWeightPerGenotype_all) + 2]},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# markers only (no connecting lines); `colors` comes from the earlier plot cell.
# NOTE(review): with mode='markers' the line=dict(color=...) may not drive the
# marker color -- marker=dict(color=col) would; confirm against plotly output
for (exp, col) in zip(['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'], colors[:5]):
    fig.add_traces(go.Scatter(x=df_averageWeightPerGenotype_all['bmID'],
                              y=df_averageWeightPerGenotype_all[exp],
                              mode='markers', line=dict(color=col), name=exp))
fig.show()
# stacked bar graph for all genotypes
# reverse the sort so the first genotype ends up at the TOP of the
# horizontal bar chart (plotly draws the y categories bottom-up)
df_averageWeightPerGenotype_all_rev = df_averageWeightPerGenotype_all.sort_values('bmID', ascending=False)
# plotly graph for all genotypes
colors = px.colors.qualitative.Plotly
layout = go.Layout(
    legend=dict(
        xanchor='right',
        x=1,
        yanchor='top',
        y=1,
        orientation='v',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=800, #3200 is a good size for all labels - when dtick = 1
    height=2500,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title': 'plant tuber weight (g)',
           'dtick':200, 'tickangle': 60,
           #'rangemode': 'tozero', 'range': [-2, len(df_averageWeightPerGenotype_all) + 2]
           },
    yaxis={'title':'Genotype', 'dtick': 1, 'tickfont':dict(family='Tahoma', size=7, color='black')},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# one horizontal bar trace per experiment, stacked per genotype.
# NOTE(review): `col` from the zip is not used inside the loop -- the bars
# take plotly's default colorway instead
for (exp, col) in zip(['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'], colors[:5]):
    fig.add_trace(go.Bar(
        y=df_averageWeightPerGenotype_all_rev['bmID'],
        x=df_averageWeightPerGenotype_all_rev[exp],
        width=0.8,
        name=exp,
        orientation='h',
        opacity=0.6,
    ))
fig.update_layout(barmode='stack')
fig.show()
# export the figure alongside the notebook
fig.write_html('stacked_bars.html')
fig.write_image('stacked_bars.svg')
For Netherlands 1999 (df_weather_1999NL):
# 1999NL weather data
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
# keep the '.value' columns only, stripping that suffix from the headers
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
df_weather_1999NL = result.loc[:, columnsToKeep].rename(
    columns={c: c[:-6] for c in columnsToKeep})
For Venezuela 2003 (df_weather_2003VE):
# 2003VE weather data
# Federated SPARQL query: daily hours of sunlight and mean temperature
# observed by the 'Merida station' (the station for the 2003 Venezuela study).
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# Strip the '.value' suffix that json_normalize appends to each SPARQL binding.
# One rename() call instead of N inplace renames on a slice (avoids
# SettingWithCopyWarning and repeated passes over the frame).
df_weather_2003VE = result[columnsToKeep].rename(columns={c: c[:-6] for c in columnsToKeep})
For Finland 2004 (df_weather_2004Fin):
# 2004Fin weather data
# Federated SPARQL query: daily hours of sunlight and mean temperature from the
# 'Ruuki station', restricted to dates before 2005 (the 2004 Finland study).
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# Strip the '.value' suffix that json_normalize appends to each SPARQL binding.
# One rename() call instead of N inplace renames on a slice (avoids
# SettingWithCopyWarning and repeated passes over the frame).
df_weather_2004Fin = result[columnsToKeep].rename(columns={c: c[:-6] for c in columnsToKeep})
df_weather_2004Fin.head()
For Finland 2005 (df_weather_2005Fin):
# 2005Fin weather data
# Federated SPARQL query: daily hours of sunlight and mean temperature from the
# 'Ruuki station', restricted to dates after 2005-01-01 (the 2005 Finland study).
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# Strip the '.value' suffix that json_normalize appends to each SPARQL binding.
# One rename() call instead of N inplace renames on a slice (avoids
# SettingWithCopyWarning and repeated passes over the frame).
df_weather_2005Fin = result[columnsToKeep].rename(columns={c: c[:-6] for c in columnsToKeep})
df_weather_2005Fin.head()
For Ethiopia 2010 (df_weather_2010ET):
# 2010ET weather data
# Federated SPARQL query: daily hours of sunlight and mean temperature
# observed by the 'Holeta station' (the station for the 2010 Ethiopia study).
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# Strip the '.value' suffix that json_normalize appends to each SPARQL binding.
# One rename() call instead of N inplace renames on a slice (avoids
# SettingWithCopyWarning and repeated passes over the frame).
df_weather_2010ET = result[columnsToKeep].rename(columns={c: c[:-6] for c in columnsToKeep})
df_weather_2010ET.head()
Beta thermal days: $\Large g \left ( T \right) = \left [ \left (\frac{T_c - T}{T_c-T_0} \right ) \cdot \left (\frac{T - T_b}{T_0-T_b} \right ) ^{\frac{T_0 - T_b}{T_c - T_0}} \right ]^{c_t}$
# calculate PBTT
# Run the photo-beta thermal time computation (calculate_PBTT, defined earlier)
# on each study's weather table. Each call returns the DataFrame (presumably
# augmented with per-day PBTT — confirm against calculate_PBTT's definition)
# plus the cumulative PBTT used for cross-study comparison below.
df_weather_2003VE, cumPBTT_2003VE = calculate_PBTT(df_weather_2003VE)
df_weather_2004Fin, cumPBTT_2004Fin = calculate_PBTT(df_weather_2004Fin)
df_weather_2005Fin, cumPBTT_2005Fin = calculate_PBTT(df_weather_2005Fin)
df_weather_2010ET, cumPBTT_2010ET = calculate_PBTT(df_weather_2010ET)
df_weather_1999NL, cumPBTT_1999NL = calculate_PBTT(df_weather_1999NL)
# Bar chart of cumulative photo-beta thermal time (PBTT) per study,
# plus a printed comparison against Paula's independently computed values.
layout = go.Layout(
    width=500,
    height=300,
    title='Cumulative photo-beta thermal time per study',
    xaxis={'title': 'study', 'dtick': 1, 'tickangle': 45, },
    yaxis={'title': 'cumulative PBTT', 'dtick': 10},
    margin=dict(l=20, r=10, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
studies = ['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET']
# Round once; the rounded values feed the lookup dict, the bar text and the
# report below (the original re-rounded the already-rounded values twice).
values = [round(v, 2) for v in
          [cumPBTT_1999NL, cumPBTT_2003VE, cumPBTT_2004Fin, cumPBTT_2005Fin, cumPBTT_2010ET]]
cpbtt = dict(zip(studies, values))  # study -> cumulative PBTT; reused by later cells
fig = go.Figure(layout=layout)
fig.add_trace(go.Bar(
    x=studies,
    y=[cpbtt[k] for k in studies],
    text=values,  # already rounded above
    textposition='auto',
    marker_color=colors[:5],
    opacity=0.8,
))
fig.show()
# Reference values computed by Paula; report the per-study deltas.
paula_cpbtt = {'1999NL': 48.75, '2003VE': 22.35, '2004Fin': 32.44, '2005Fin': 40.85, '2010ET': 24.62}
print('Cumulative PBTT differences from Paula\'s (Paula\'s - calculated here):')
for (s, pVal, eVal) in zip(studies, [paula_cpbtt[k] for k in studies], values):
    print('For {:>7}, {:4.2f} - {:4.2f} = {:+4.2f}'.format(s, pVal, eVal, round(pVal - eVal, 2)))
# add columns to df, with values and index of min / max per genotype
df_performance = df_averageWeightPerGenotype_all.set_index('bmID', inplace=False)
# Compute all four statistics BEFORE appending any of them as columns.
# The original ran max(axis=1)/min(axis=1) AFTER inserting the string column
# 'max_study', relying on pandas silently skipping non-numeric columns —
# recent pandas raises TypeError for that mixed-dtype row reduction.
max_study = df_performance.idxmax(axis=1)  # study with the highest weight per genotype
min_study = df_performance.idxmin(axis=1)  # study with the lowest weight per genotype
max_val = df_performance.max(axis=1)
min_val = df_performance.min(axis=1)
df_performance['max_study'] = max_study
df_performance['max_val'] = max_val
df_performance['min_val'] = min_val
df_performance['min_study'] = min_study
#list(df_performance.columns.values) - sorted in order of ascending cPBTT:
# ['2003VE', '2010ET', '2004Fin', '2005Fin', '1999NL']
df_perf_ordered = df_performance[['2003VE', '2010ET', '2004Fin', '2005Fin', '1999NL']]
# Performance plot: one straight segment per genotype from its worst to its
# best study, positioned by each study's cumulative PBTT. Green segments rise
# with cPBTT; red segments fall.
layout = go.Layout(
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title': 'cumulative PBTT', 'tickangle': 85},
    yaxis={'title': 'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
for genotype, row in df_performance.iterrows():
    lo_study = row['min_study']
    hi_study = row['max_study']
    # darkred when the best-performing study has the LOWER cumulative PBTT
    segment_color = 'darkred' if cpbtt[lo_study] > cpbtt[hi_study] else 'darkgreen'
    fig.add_traces(go.Scatter(
        x=[cpbtt[lo_study], cpbtt[hi_study]],
        y=[row['min_val'], row['max_val']],
        line=dict(color=segment_color),
        name=genotype + ': ' + hi_study + '-' + lo_study,
    ))
fig.show()
# Performance plot: full per-study weight trajectory per genotype, ordered by
# ascending cumulative PBTT. Trajectories that change monotonically are drawn
# darkgreen and counted/bucketed by how many studies have data for them;
# non-monotonic ones are darkred.
layout = go.Layout(
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title': 'cumulative PBTT', 'tickangle': 85},
    yaxis={'title': 'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
studies_ordered = ['2003VE', '2010ET', '2004Fin', '2005Fin', '1999NL']  # ascending cPBTT
cpbtt_labels = [cpbtt[s] for s in studies_ordered]
# Buckets of genotype ids for monotonic trajectories, keyed by data-point count.
at_least_2, at_least_3, at_least_4, at_least_5 = [], [], [], []
points = []  # monotonic "trajectories" with a single observation
monotonic_counter = 0
for genotype, row in df_performance.iterrows():
    trajectory = [row[s] for s in studies_ordered]
    observed = [v for v in trajectory if not pd.isnull(v)]  # drop missing studies
    if monotonic(observed):
        line_color = 'darkgreen'
        monotonic_counter += 1
        n_obs = len(observed)
        if n_obs > 4:
            at_least_5.append(genotype)
        elif n_obs > 3:
            at_least_4.append(genotype)
        elif n_obs > 2:
            at_least_3.append(genotype)
        elif n_obs > 1:
            at_least_2.append(genotype)
        else:
            points.append(genotype)
    else:
        line_color = 'darkred'
    fig.add_traces(go.Scatter(
        x=cpbtt_labels,
        y=trajectory,
        line=dict(color=line_color),
        name=genotype,
    ))
fig.show()
# Summary of monotonic genotypes per number of observed studies.
print('1: ' + str(len(points)))
print('2: ' + str(len(at_least_2)))
print('3: ' + str(len(at_least_3)))
print('4: ' + str(len(at_least_4)) + ' ' + str(at_least_4))
print('5: ' + str(len(at_least_5)))
print(monotonic_counter)